In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
In [2]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from matplotlib import pyplot
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.regressor import CooksDistance
from sklearn.neural_network import MLPRegressor
import xgboost
In [3]:
ls official_train_test_split
60-40/  70-30/  80-20/
In [4]:
df = pd.read_csv("official_train_test_split/80-20/train_set.csv")
df_test  = pd.read_csv("official_train_test_split/80-20/test_set.csv") 
In [5]:
df_test.tail(2)
Out[5]:
CountryCode Days since first case record_date c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_3_action c8_4_action e1_0_action e1_1_action e1_2_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h1_2_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action E3_Fiscal measures E4_International support H4_Emergency investment in healthcare H5_Investment in vaccines Log_new_cases_per_million Outputvalue_lag1 Outputvalue_lag2 Outputvalue_lag3 Outputvalue_lag4 Outputvalue_lag5 Outputvalue_lag6 Outputvalue_lag7 population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity weather_situation temperature windSpeed Number of Tweet Sentiments isHoliday urbanPopulation healthExpenditure Month Day_of_Month Day_of_Week
2300 NOR 41 2020-04-08 0 0 0 28 0 30 0 0 0 0 28 0 0 0 0 16 0 28 0 42 0 0 0 0 0 24 0 0 0 0 25 42 0 0 0 0 27 0 0 42 0 42 0 0 0 8 0 0.0 0.0 0.0 0.0 1.326602 1.379849 1.74591 1.70206 1.697264 1.604356 1.610298 1.359323 14.462 64800.057 114.316 5.31 86.979 3.6 19.6 20.7 82.40 13.817 0.65 partly-cloudy-day 41.09 7.75 1.0 0.6124 -1 82.616 8.928808 4 8 2
2301 FJI 70 2020-05-29 0 0 0 72 0 0 15 0 0 0 75 0 0 0 72 0 15 0 0 0 0 72 0 0 0 72 0 0 0 0 62 0 64 0 0 0 65 0 0 129 0 0 0 43 0 0 72 0.0 0.0 0.0 0.0 0.000000 0.000000 0.00000 0.00000 0.000000 0.000000 0.000000 0.000000 49.562 8702.975 412.820 14.49 65.386 2.3 10.2 34.8 67.44 4.754 0.78 cloudy 77.03 8.27 0.0 0.0000 -1 56.750 2.318574 5 29 4

Clean n transform data

In [6]:
df.record_date = pd.to_datetime(df.record_date, format='%Y-%m-%d')
df_test.record_date = pd.to_datetime(df_test.record_date, format='%Y-%m-%d')
In [7]:
icon_onehot_col = pd.get_dummies(df.weather_situation)
df = pd.concat([df, icon_onehot_col], axis=1).drop(["weather_situation"],axis=1)

icon_onehot_col = pd.get_dummies(df_test.weather_situation)
df_test = pd.concat([df_test, icon_onehot_col], axis=1).drop(["weather_situation"],axis=1)
In [8]:
boruta_1 = pd.read_excel("dataset/Boruta result.xlsx", sheet_name="Boruta result 1 ")
boruta_2 = pd.read_excel("dataset/Boruta result.xlsx", sheet_name="Boruta result 2")
In [9]:
vars_1 = boruta_1[boruta_1["Decision "] == "Confirmed"]["Variable"].values
vars_2 = boruta_2[boruta_2["Decision "] == "Confirmed"]["Variable"].values
In [10]:
df.head(2)
Out[10]:
CountryCode Days since first case record_date c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c3_2_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_3_action c8_4_action e1_0_action e1_1_action e1_2_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h1_2_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action E3_Fiscal measures E4_International support H4_Emergency investment in healthcare H5_Investment in vaccines Log_new_cases_per_million Outputvalue_lag1 Outputvalue_lag2 Outputvalue_lag3 Outputvalue_lag4 Outputvalue_lag5 Outputvalue_lag6 Outputvalue_lag7 population_density gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy aged_65_older_sum humidity temperature windSpeed Number of Tweet Sentiments isHoliday urbanPopulation healthExpenditure Month Day_of_Month Day_of_Week clear-day clear-night cloudy fog partly-cloudy-day partly-cloudy-night rain snow wind
0 CAN 112 2020-05-17 0 0 0 63 0 0 0 61 0 0 67 0 0 0 0 56 113 0 0 0 65 0 0 0 0 59 0 0 0 0 61 0 0 64 0 61 0 0 0 68 0 0 0 60 0 113 0 0.0 0.0 0.0 0.0 1.502714 1.47356 1.472786 1.493584 1.477396 1.482359 1.526288 1.602722 4.037 44017.591 105.599 7.37 89.443 2.5 12.0 16.6 82.43 13.8905 0.72 51.15 4.7 1.0 -0.361200 -1 81.482 7.794102 5 17 6 0 0 0 0 0 1 0 0 0
1 NPL 75 2020-04-09 0 0 0 22 0 0 0 19 0 0 23 0 0 0 0 19 0 0 17 0 0 18 0 0 0 19 0 0 0 0 18 0 11 0 0 0 11 0 0 73 0 88 0 0 0 0 77 0.0 0.0 0.0 0.0 0.000000 0.00000 -0.987163 0.000000 -1.468521 0.000000 0.000000 0.000000 204.430 2442.804 260.797 7.26 47.782 0.3 9.5 37.8 70.78 4.5105 0.43 72.63 2.2 3.0 0.084421 -1 20.153 1.240623 4 9 3 1 0 0 0 0 0 0 0 0

Variable 1

In [85]:
var1_new = ['Days since first case', 'clear-night',
       'partly-cloudy-night', 'temperature', 'humidity',
       'windSpeed', 'Number of Tweet', 'Sentiments', 'c1_0_action',
       'c1_1_action', 'c1_2_action', 'c1_3_action', 'c2_0_action',
       'c2_1_action', 'c2_2_action', 'c2_3_action', 'c3_0_action',
       'c3_1_action', 'c4_0_action', 'c4_1_action', 'c4_2_action',
       'c4_3_action', 'c4_4_action', 'c5_0_action', 'c5_1_action',
       'c5_2_action', 'c6_0_action', 'c6_1_action', 'c6_2_action',
       'c6_3_action', 'c7_0_action', 'c7_1_action', 'c7_2_action',
       'c8_0_action', 'c8_1_action', 'c8_2_action', 'c8_3_action',
       'c8_4_action', 'e1_0_action', 'e1_1_action', 'e1_2_action',
       'e2_0_action', 'e2_1_action', 'e2_2_action', 'h1_0_action',
       'h1_1_action', 'h1_2_action', 'h2_0_action', 'h2_1_action',
       'h2_2_action', 'h2_3_action', 'h3_0_action', 'h3_1_action',
       'h3_2_action',  'population_density',
       'urbanPopulation', 'healthExpenditure', 'gdp_per_capita',
       'cvd_death_rate', 'diabetes_prevalence', 'handwashing_facilities',
       'hospital_beds_per_thousand', 'female_smokers', 'male_smokers',
       'life_expectancy', 'Outputvalue_lag1','Outputvalue_lag2',
        'Outputvalue_lag3','Outputvalue_lag4','Outputvalue_lag5',
        'Outputvalue_lag6','Outputvalue_lag7',
        'Month','Day_of_Month','Day_of_Week','Log_new_cases_per_million']
In [86]:
df_train = df[var1_new]
df_test = df_test[var1_new]
In [87]:
print(df_train.shape, df_test.shape)
(9207, 76) (2302, 76)
In [88]:
df_train.head(2)
Out[88]:
Days since first case clear-night partly-cloudy-night temperature humidity windSpeed Number of Tweet Sentiments c1_0_action c1_1_action c1_2_action c1_3_action c2_0_action c2_1_action c2_2_action c2_3_action c3_0_action c3_1_action c4_0_action c4_1_action c4_2_action c4_3_action c4_4_action c5_0_action c5_1_action c5_2_action c6_0_action c6_1_action c6_2_action c6_3_action c7_0_action c7_1_action c7_2_action c8_0_action c8_1_action c8_2_action c8_3_action c8_4_action e1_0_action e1_1_action e1_2_action e2_0_action e2_1_action e2_2_action h1_0_action h1_1_action h1_2_action h2_0_action h2_1_action h2_2_action h2_3_action h3_0_action h3_1_action h3_2_action population_density urbanPopulation healthExpenditure gdp_per_capita cvd_death_rate diabetes_prevalence handwashing_facilities hospital_beds_per_thousand female_smokers male_smokers life_expectancy Outputvalue_lag1 Outputvalue_lag2 Outputvalue_lag3 Outputvalue_lag4 Outputvalue_lag5 Outputvalue_lag6 Outputvalue_lag7 Month Day_of_Month Day_of_Week Log_new_cases_per_million
0 112 0 1 51.15 0.72 4.7 1.0 -0.361200 0 0 0 63 0 0 0 61 0 0 0 0 0 0 56 113 0 0 0 65 0 0 0 0 59 0 0 0 0 61 0 0 64 0 61 0 0 0 68 0 0 0 60 0 113 0 4.037 81.482 7.794102 44017.591 105.599 7.37 89.443 2.5 12.0 16.6 82.43 1.47356 1.472786 1.493584 1.477396 1.482359 1.526288 1.602722 5 17 6 1.502714
1 75 0 0 72.63 0.43 2.2 3.0 0.084421 0 0 0 22 0 0 0 19 0 0 0 0 0 0 19 0 0 17 0 0 18 0 0 0 19 0 0 0 0 18 0 11 0 0 0 11 0 0 73 0 88 0 0 0 0 77 204.430 20.153 1.240623 2442.804 260.797 7.26 47.782 0.3 9.5 37.8 70.78 0.00000 -0.987163 0.000000 -1.468521 0.000000 0.000000 0.000000 4 9 3 0.000000
In [89]:
y_train = df_train['Log_new_cases_per_million']
X_train = df_train.drop('Log_new_cases_per_million', axis=1)

y_test = df_test['Log_new_cases_per_million']
X_test = df_test.drop('Log_new_cases_per_million', axis=1)

Train Model

In [90]:
def get_stacking():
    # define the base models
    level0 = list()
#     level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('ranFor',RandomForestRegressor(random_state=0)))
    level0.append(('graBoost',GradientBoostingRegressor(random_state=0)))
    level0.append(('ridge', Ridge(alpha = 10,tol=0.001)))
    level0.append(('lasso', Lasso(alpha = 0.001)))
    level0.append(('linear_reg', LinearRegression()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    return model

# get a list of models to evaluate
def get_models():
    models = dict()
#     models['cart'] = DecisionTreeRegressor()
    models['ranFor'] = RandomForestRegressor(random_state=0)
    models['graBoost'] = GradientBoostingRegressor(random_state=0)
    models['ridge'] = Ridge(alpha = 0.001)
    models['lasso'] = Lasso(alpha = 0.001)
    models['linear_reg'] = LinearRegression()
    models['stacking'] = get_stacking()
    return models

R2

In [91]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline
>ranFor 0.779 (0.016)
>graBoost 0.777 (0.016)
>ridge 0.763 (0.017)
>lasso 0.763 (0.017)
>linear_reg 0.763 (0.017)
/home/harry/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  "timeout or by a memory leak.", UserWarning
>stacking 0.783 (0.016)

Root mean square error

In [92]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline
>ranFor -0.413 (0.016)
>graBoost -0.415 (0.017)
>ridge -0.428 (0.017)
>lasso -0.427 (0.018)
>linear_reg -0.428 (0.017)
>stacking -0.409 (0.016)

Feature Importance

In [93]:
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)
(pd.Series(reg.feature_importances_, index=X_train.columns)
   .nlargest(10)
   .plot(kind='barh'))
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb8adc85310>

Residuals + R2 plot

Residuals, in the context of regression models, are the difference between the observed value of the target variable (y) and the predicted value (ŷ), i.e. the error of the prediction. The residuals plot shows the difference between residuals on the vertical axis and the dependent variable on the horizontal axis, allowing you to detect regions within the target that may be susceptible to more or less error.

In [94]:
model = RandomForestRegressor(random_state=0)
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.show()
Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb8b137cf10>
In [96]:
from sklearn.inspection import plot_partial_dependence
reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
disp1 = plot_partial_dependence(reg, X_train,features=[1,2,3])  
In [97]:
disp1 = plot_partial_dependence(reg, X_train,features=[3,4,5])  
In [98]:
disp1 = plot_partial_dependence(reg, X_train,features=[6,7,8])  
In [99]:
disp1 = plot_partial_dependence(reg, X_train,features=[9,10,11])  
In [100]:
disp1 = plot_partial_dependence(reg, X_train,features=[12,13,14])  
In [101]:
disp1 = plot_partial_dependence(reg, X_train,features=[15,16,17,18])  

R2

In [102]:
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
model.score(X_test, y_test)
Out[102]:
0.7932450118388492

RMSE

In [103]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred,squared=True)
Out[103]:
0.15950357146091934

Variables - 2

In [13]:
var2_ = ['Days since first case', 'clear-night',
       'partly-cloudy-day','partly-cloudy-night', 'temperature', 'humidity',
       'windSpeed', 'Number of Tweet', 'Sentiments', 'c1_0_action',
       'c1_1_action', 'c1_2_action', 'c1_3_action', 'c2_0_action',
       'c2_1_action', 'c2_2_action', 'c2_3_action', 'c3_2_action',
       'c4_0_action', 'c4_1_action', 'c4_2_action', 'c4_3_action',
       'c4_4_action', 'c5_0_action', 'c5_1_action', 'c5_2_action',
       'c6_0_action', 'c6_1_action', 'c6_2_action', 'c6_3_action',
       'c7_0_action', 'c7_1_action', 'c7_2_action', 'c8_0_action',
       'c8_1_action', 'c8_2_action', 'c8_3_action', 'c8_4_action',
       'e1_0_action', 'e1_1_action', 'e1_2_action', 'e2_0_action',
       'e2_1_action', 'e2_2_action', 'h1_0_action', 'h1_1_action',
       'h1_2_action', 'h2_0_action', 'h2_1_action', 'h2_2_action',
       'h2_3_action', 'h3_0_action', 'h3_1_action', 'h3_2_action',
       'H5_Investment in vaccines', 'population_density',
       'aged_65_older_sum', 'gdp_per_capita', 'cvd_death_rate',
       'handwashing_facilities', 'hospital_beds_per_thousand',
       'life_expectancy','Log_new_cases_per_million']
In [14]:
df_train = df[var2_]
df_test = df_test[var2_]
In [15]:
print(df_train.shape, df_test.shape)
(9207, 63) (2302, 63)
In [16]:
y_train = df_train['Log_new_cases_per_million']
X_train = df_train.drop('Log_new_cases_per_million', axis=1)

y_test = df_test['Log_new_cases_per_million']
X_test = df_test.drop('Log_new_cases_per_million', axis=1)

Train Models

In [71]:
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('xgboost',xgboost.XGBRegressor()))
    level0.append(('ranFor',RandomForestRegressor(random_state=0)))
    level0.append(('graBoost',GradientBoostingRegressor(random_state=0)))
    level0.append(('ridge', Ridge(alpha = 10,tol=0.001)))
    level0.append(('lasso', Lasso(alpha = 0.001)))
    level0.append(('linear_reg', LinearRegression()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    return model

# get a list of models to evaluate
def get_models():
    models = dict()
    models['xgboost'] = xgboost.XGBRegressor()
    models['ranFor'] = RandomForestRegressor(random_state=0)
    models['graBoost'] = GradientBoostingRegressor(random_state=0)
    models['ridge'] = Ridge(alpha = 0.001)
    models['lasso'] = Lasso(alpha = 0.001)
    models['linear_reg'] = LinearRegression()
    models['stacking'] = get_stacking()
    return models

R2

In [72]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline
>xgboost 0.769 (0.017)
>ranFor 0.780 (0.016)
>graBoost 0.696 (0.015)
>ridge 0.328 (0.019)
>lasso 0.328 (0.020)
>linear_reg 0.328 (0.019)
A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
>stacking 0.786 (0.015)

Root mean squared error

In [119]:
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
%matplotlib inline
>ranFor -0.412 (0.015)
>graBoost -0.485 (0.014)
>ridge -0.721 (0.021)
>lasso -0.721 (0.021)
>linear_reg -0.721 (0.021)
>stacking -0.408 (0.015)

Feature Importance

In [120]:
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)
(pd.Series(reg.feature_importances_, index=X_train.columns)
   .nlargest(10)
   .plot(kind='barh'))
Out[120]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb8acca2150>

Residual + R2 plot

In [121]:
model = RandomForestRegressor(random_state=0)
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.show()
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb8ae20a9d0>

R2

In [75]:
from sklearn.metrics import r2_score
model = RandomForestRegressor(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)
Out[75]:
0.7955888800139136
In [80]:
# model.score(X_test, y_test) # =R2

RMSE

In [81]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred,squared=True)
Out[81]:
0.1576953667434353
In [83]:
from sklearn.inspection import plot_partial_dependence
disp1 = plot_partial_dependence(model, X_train,features=[1,2,3])  
In [84]:
disp1 = plot_partial_dependence(reg, X_train,features=[3,4,5])  
In [85]:
disp1 = plot_partial_dependence(reg, X_train,features=[6,7,8])  
In [86]:
disp1 = plot_partial_dependence(reg, X_train,features=[9,10,11])  
In [87]:
disp1 = plot_partial_dependence(reg, X_train,features=[12,13,14])  
In [88]:
disp1 = plot_partial_dependence(reg, X_train,features=[15,16,17])  
In [89]:
disp1 = plot_partial_dependence(reg, X_train,features=[18,19,20])  
In [90]:
disp1 = plot_partial_dependence(reg, X_train,features=[21,22,23])  
In [91]:
disp1 = plot_partial_dependence(reg, X_train,features=[24,25,26])  
In [92]:
disp1 = plot_partial_dependence(reg, X_train,features=[27,28,29])  
In [93]:
disp1 = plot_partial_dependence(reg, X_train,features=[30,31,32])  
In [94]:
disp1 = plot_partial_dependence(reg, X_train,features=[33,35,36])  
In [95]:
disp1 = plot_partial_dependence(reg, X_train,features=[37,38,39])  
In [96]:
disp1 = plot_partial_dependence(reg, X_train,features=[40,41,42])  
In [97]:
disp1 = plot_partial_dependence(reg, X_train,features=[43,44,45])
In [98]:
disp1 = plot_partial_dependence(reg, X_train,features=[46,47,48])
In [99]:
disp1 = plot_partial_dependence(reg, X_train,features=[49,50,51])  
In [100]:
disp1 = plot_partial_dependence(reg, X_train,features=[52,53,54])  
In [101]:
disp1 = plot_partial_dependence(reg, X_train,features=[55,56,57])  
In [102]:
disp1 = plot_partial_dependence(reg, X_train,features=[58,59,60])  

SHAP (SHapley Additive exPlanations)

In [103]:
import shap

# load JS visualization code to notebook
shap.initjs()

# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])
Out[103]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

The above explanation shows features each contributing to push the model output from the base value to the model output. Features pushing the prediction higher are shown in red, those pushing the prediction lower are in blue. The base_value here is 0.4979 while our predicted value is 0.7. Goal Scored = 2 has the biggest impact on increasing the prediction, while ball possession feature has the biggest effect in decreasing the prediction.

In [104]:
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values, X_train)
shap.plots.force is slow for many thousands of rows, try subsampling your data.
Out[104]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [105]:
# summarize the effects of all the features
shap.summary_plot(shap_values, X_train)

To get an overview of which features are most important for a model we can plot the SHAP values of every feature for every sample. The summary plot tells which features are most important, and also their range of effects over the dataset.

Local Surrogate (LIME)

In [106]:
import lime
import lime.lime_tabular
In [108]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, verbose=True, mode='regression')
In [112]:
X_test.iloc[25]
Out[112]:
Days since first case           106.000
clear-night                       0.000
partly-cloudy-day                 1.000
partly-cloudy-night               0.000
temperature                      62.610
                                ...    
gdp_per_capita                46682.515
cvd_death_rate                  114.767
handwashing_facilities           97.164
hospital_beds_per_thousand        2.500
life_expectancy                  80.900
Name: 25, Length: 62, dtype: float64
In [109]:
i = 25
exp = explainer.explain_instance(X_test.iloc[i], model.predict, num_features=5)
Intercept 0.12229108069782985
Prediction_local [0.69854288]
Right: 0.6892780249531296
In [110]:
exp.show_in_notebook(show_table=True)
In [111]:
exp.as_list()
Out[111]:
[('gdp_per_capita > 34272.36', 0.24783541975768708),
 ('aged_65_older_sum > 12.82', 0.16215411476367797),
 ('cvd_death_rate <= 137.02', 0.15614843314903928),
 ('c2_0_action <= 0.00', 0.0993510132792419),
 ('c8_4_action <= 0.00', -0.08923718243084912)]